/**************************************************************************************
* Copyright (c) 2020 Institute of Computing Technology, CAS
* Copyright (c) 2020 University of Chinese Academy of Sciences
*
* NutShell is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*             http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR
* FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/

package nutcore

import chisel3._
import chisel3.util._
import chisel3.util.experimental.BoringUtils

import utils._
import bus.simplebus._
import difftest._

/** Elaboration-time sizing constants mixed into the out-of-order backend. */
trait HasBackendConst {
  // val multiIssue = true

  // Reorder buffer sizing; the instruction capacity is the product of the two.
  val robSize: Int = 16
  val robWidth: Int = 2
  val robInstCapacity: Int = robSize * robWidth

  // register map checkpoint size, and the tag width needed to index a checkpoint
  val checkpointSize: Int = 4
  val brTagWidth: Int = log2Up(checkpointSize)

  // physical rf addr width
  val prfAddrWidth: Int = log2Up(robSize) + log2Up(robWidth)

  // Per-cycle widths of the dispatch, commit (CDB) and retire paths.
  val DispatchWidth: Int = 2
  val CommitWidth: Int = 2
  val RetireWidth: Int = 2

  // Selects the checkpoint-based variants of the `if (enableCheckpoint)` branches in the backend.
  val enableCheckpoint: Boolean = true
}

/**
  * NutShell/Argo Out Of Order Execution Backend.
  *
  * Accepts up to two decoded instructions per cycle on `io.in`, allocates them in the ROB,
  * dispatches them into one reservation station per function unit (BRU, ALU1, ALU2,
  * CSR/MOU, LSU, MDU), and arbitrates the function-unit results onto a `CommitWidth`-entry
  * common data bus (`cdb`) that is broadcast to the ROB and to every reservation station.
  *
  * For current version:
  *  - There is only 1 BRU
  *  - There is only 1 LSU
  *
  * Connection order matters in this module (Chisel last-connect semantics): the `when`
  * blocks that refine `inst(i)` in the dispatch stage deliberately override one another in
  * the order in which they are written.
  */
class Backend_ooo(implicit val p: NutCoreConfig) extends NutCoreModule with HasRegFileParameter with HasBackendConst{

  val io = IO(new Bundle {
    // EXU
    val in = Vec(2, Flipped(Decoupled(new DecodeIO)))
    val flush = Input(Bool())
    val dmem = new SimpleBusUC(addrBits = VAddrBits, userBits = DCacheUserBundleWidth)
    val dtlb = new SimpleBusUC(addrBits = VAddrBits, userBits = DCacheUserBundleWidth)

    val memMMU = Flipped(new MemMMUIO)

    // WBU
    val redirect = new RedirectIO
  })

  val cdb = Wire(Vec(CommitWidth, Valid(new OOCommitIO)))
  val rf = new RegFile
  val rob = Module(new ROB)

  val brurs  = Module(new RS(priority = true, size = checkpointSize, checkpoint = true, name = "BRURS"))
  val alu1rs = Module(new RS(priority = true, size = 4, name = "ALU1RS"))
  val alu2rs = Module(new RS(priority = true, size = 4, name = "ALU2RS"))
  val csrrs  = Module(new RS(priority = true, size = 1, name = "CSRRS")) // CSR & MOU
  val lsurs  = Module(new RS(storeSeq = true, size = 4, name = "LSURS")) // FIXIT: out of order l/s disabled
  val mdurs  = Module(new RS(priority = true, size = 4, pipelined = false, name = "MDURS"))

  val bruDelayer = Module(new WritebackDelayer(bru = true))
  val mduDelayer = Module(new WritebackDelayer())

  // instCango(i): instruction i is dispatched this cycle. The extra last entry is tied low
  // and is what a reservation station selects when it has no instruction to take (`noInst`).
  val instCango = Wire(Vec(DispatchWidth + 1, Bool()))
  val bruRedirect = Wire(new RedirectIO)
  val mispredictRec = Wire(new MisPredictionRecIO)
  // Without checkpoints, a ROB redirect of rtype 1 also flushes the whole backend.
  val flushBackend = if(enableCheckpoint){
    io.flush
  } else {
    io.flush || rob.io.redirect.valid && rob.io.redirect.rtype === 1.U
  }

  // A valid rtype-0 redirect from the ROB takes precedence over the BRU redirect.
  io.redirect := Mux(rob.io.redirect.valid && rob.io.redirect.rtype === 0.U, rob.io.redirect, bruRedirect)

  rob.io.cdb <> cdb
  rob.io.mispredictRec := mispredictRec
  rob.io.flush := flushBackend
  when (rob.io.wb(0).rfWen) { rf.write(rob.io.wb(0).rfDest, rob.io.wb(0).rfData) }
  when (rob.io.wb(1).rfWen) { rf.write(rob.io.wb(1).rfDest, rob.io.wb(1).rfData) }
  // Only the ready handshake is wired here. rob.io.in(i).valid and rob.io.in(i).bits have a
  // single driver at the end of the dispatch stage (the renamed instruction), so the
  // redundant early connections that used to sit here were removed.
  for (i <- 0 until DispatchWidth) {
    io.in(i).ready := rob.io.in(i).ready && instCango(i)
  }

  brurs.io.updateCheckpoint.get <> rob.io.updateCheckpoint
  rob.io.recoverCheckpoint.bits := bruDelayer.io.freeCheckpoint.get.bits
  brurs.io.freeCheckpoint.get <> bruDelayer.io.freeCheckpoint.get
  if(enableCheckpoint){
    rob.io.recoverCheckpoint.valid := io.redirect.valid && io.redirect.rtype === 1.U
  } else {
    rob.io.recoverCheckpoint.valid := false.B
    rob.io.updateCheckpoint.valid := false.B
  }

  // ------------------------------------------------
  // Backend stage 1
  // Dispatch
  // ------------------------------------------------

  // Choose inst to be dispatched
  // Check structural hazard
  //TODO: Use bit instead of counter

  /** Number of valid incoming instructions whose decode information satisfies `isKind`. */
  def countIncoming(isKind: DecodeIO => Bool): UInt =
    PopCount(io.in.map(in => in.valid && isKind(in.bits)))

  val mduCnt = Wire(UInt(2.W))
  mduCnt := countIncoming(_.ctrl.fuType === FuType.mdu)
  val lsuCnt = Wire(UInt(2.W))
  lsuCnt := countIncoming(_.ctrl.fuType === FuType.lsu)
  val bruCnt = Wire(UInt(2.W))
  bruCnt := countIncoming(d => d.ctrl.fuType === FuType.bru && ALUOpType.isBru(d.ctrl.fuOpType))
  val csrCnt = Wire(UInt(2.W))
  csrCnt := countIncoming(d => d.ctrl.fuType === FuType.csr || d.ctrl.fuType === FuType.mou)

  // Architectural source registers, laid out as [in0.src1, in0.src2, in1.src1, in1.src2];
  // the ROB lookup ports below (aprf / rvalid / rcommited / rprf) are indexed the same way.
  val rfSrc = List(
    io.in(0).bits.ctrl.rfSrc1,
    io.in(0).bits.ctrl.rfSrc2,
    io.in(1).bits.ctrl.rfSrc1,
    io.in(1).bits.ctrl.rfSrc2
  )
  val rfDest = List(
    io.in(0).bits.ctrl.rfDest,
    io.in(1).bits.ctrl.rfDest
  )

  // Renamed instructions; the extra last entry is a don't-care placeholder selected by `noInst`.
  val inst = Wire(Vec(DispatchWidth + 1, new RenamedDecodeIO))

  for (i <- 0 until DispatchWidth) {
    inst(i).decode := io.in(i).bits
    // destination tag = {ROB index, slot within the dispatch pair}
    inst(i).prfDest := Cat(rob.io.index, i.U(1.W))
    inst(i).prfSrc1 := rob.io.aprf(2*i)
    inst(i).prfSrc2 := rob.io.aprf(2*i+1)
    inst(i).src1Rdy := !rob.io.rvalid(2*i) || rob.io.rcommited(2*i)
    inst(i).src2Rdy := !rob.io.rvalid(2*i+1) || rob.io.rcommited(2*i+1)
    inst(i).brMask := DontCare
    // read rf, update src
    inst(i).decode.data.src1 := rf.read(rfSrc(2*i))
    when(rob.io.rvalid(2*i) && rob.io.rcommited(2*i)){inst(i).decode.data.src1 := rob.io.rprf(2*i)}
    inst(i).decode.data.src2 := rf.read(rfSrc(2*i + 1))
    when(rob.io.rvalid(2*i+1) && rob.io.rcommited(2*i+1)){inst(i).decode.data.src2 := rob.io.rprf(2*i+1)}
  }
  inst(DispatchWidth) := DontCare

  def isDepend(rfSrc: UInt, rfDest: UInt, wen: Bool): Bool = (rfSrc =/= 0.U) && (rfSrc === rfDest) && wen
  def isDepend2(rfSrc: UInt, rfDest1: UInt, wen1: Bool, rfDest2: UInt, wen2: Bool): Bool = (rfSrc =/= 0.U) && ((rfSrc === rfDest1) && wen1 || (rfSrc === rfDest2) && wen2)

  // check dependency for insts at commit stage
  // (a result on the CDB this cycle is forwarded straight into the dispatched instruction)
  for (i <- 0 until DispatchWidth; j <- 0 until CommitWidth) {
    when(inst(i).prfSrc1 === cdb(j).bits.prfidx && cdb(j).valid && cdb(j).bits.decode.ctrl.rfWen && rob.io.rvalid(2*i)){
      inst(i).src1Rdy := true.B
      inst(i).decode.data.src1 := cdb(j).bits.commits
    }
    when(inst(i).prfSrc2 === cdb(j).bits.prfidx && cdb(j).valid && cdb(j).bits.decode.ctrl.rfWen && rob.io.rvalid(2*i+1)){
      inst(i).src2Rdy := true.B
      inst(i).decode.data.src2 := cdb(j).bits.commits
    }
  }

  // check dependency for insts at dispatch stage
  // (inst 1 reading what inst 0 writes must wait on inst 0's freshly allocated tag)
  when(isDepend(inst(1).decode.ctrl.rfSrc1, inst(0).decode.ctrl.rfDest, inst(0).decode.ctrl.rfWen)){
    inst(1).src1Rdy := false.B
    inst(1).prfSrc1 := Cat(rob.io.index, 0.U)
  }
  when(isDepend(inst(1).decode.ctrl.rfSrc2, inst(0).decode.ctrl.rfDest, inst(0).decode.ctrl.rfWen)){
    inst(1).src2Rdy := false.B
    inst(1).prfSrc2 := Cat(rob.io.index, 0.U)
  }

  // fix src
  // (non-register operands are always ready; written last so they override everything above)
  for (i <- 0 until DispatchWidth) {
    when(io.in(i).bits.ctrl.src1Type === SrcType.pc){
      inst(i).src1Rdy := true.B
      inst(i).decode.data.src1 := SignExt(io.in(i).bits.cf.pc, AddrBits)
    }
    when(io.in(i).bits.ctrl.src2Type =/= SrcType.reg){
      inst(i).src2Rdy := true.B
      inst(i).decode.data.src2 := io.in(i).bits.data.imm
    }
  }

  //TODO: refactor src gen with Mux1H

  // Chicken Bit
  // When set, every incoming instruction is treated as a blocking instruction. The switch
  // is applied where hasBlockInst is built: the entries are plain expressions, not Wires,
  // so they cannot be re-assigned afterwards.
  val Chicken = false

  // We have to block store inst before we decouple load and store
  val hasBlockInst = List.tabulate(DispatchWidth)(i =>
    if (Chicken) true.B else io.in(i).bits.ctrl.noSpecExec || io.in(i).bits.ctrl.isBlocked
  )
  val pipeLineEmpty = rob.io.empty && alu1rs.io.empty && alu2rs.io.empty && csrrs.io.empty && lsurs.io.empty && mdurs.io.empty

  // blockReg is set when an `isBlocked` instruction is dispatched from slot 0 and holds off
  // further dispatch until the ROB is empty (or flushed) and the LSU has no unfinished store.
  // The set condition is written last, so it wins if both conditions hold in one cycle.
  val blockReg = RegInit(false.B)
  val haveUnfinishedStore = Wire(Bool())
  when((rob.io.empty || flushBackend) && !haveUnfinishedStore){ blockReg := false.B }
  when(io.in(0).bits.ctrl.isBlocked && io.in(0).fire){ blockReg := true.B }
  val mispredictionRecoveryReg = RegInit(false.B)
  when(io.redirect.valid && io.redirect.rtype === 1.U){ mispredictionRecoveryReg := true.B }
  when(rob.io.empty || flushBackend){ mispredictionRecoveryReg := false.B }
  val mispredictionRecovery = if(enableCheckpoint){
    io.redirect.valid && io.redirect.rtype === 1.U
  } else {
    mispredictionRecoveryReg && !rob.io.empty || io.redirect.valid && io.redirect.rtype === 1.U // waiting for misprediction recovery or misprediction detected
  }

  Debug(flushBackend, "flushbackend\n")
  Debug(io.redirect.valid && io.redirect.rtype === 1.U, "[REDIRECT] bpr start, redirect to %x\n", io.redirect.target)
  Debug(io.redirect.valid && io.redirect.rtype === 0.U, "[REDIRECT]special redirect to %x\n", io.redirect.target)

  instCango(0) :=
    io.in(0).valid &&
    rob.io.in(0).ready && // rob has empty slot
    !(hasBlockInst(0) && !pipeLineEmpty) &&
    !blockReg &&
    !mispredictionRecovery &&
    LookupTree(io.in(0).bits.ctrl.fuType, List(
      FuType.bru -> brurs.io.in.ready,
      FuType.alu -> alu1rs.io.in.ready,
      FuType.lsu -> lsurs.io.in.ready,
      FuType.mdu -> mdurs.io.in.ready,
      FuType.csr -> csrrs.io.in.ready,
      FuType.mou -> csrrs.io.in.ready
    ))
  // Slot 1 may only go together with slot 0, and only if the two instructions do not need
  // the same single-ported reservation station (count < 2).
  instCango(1) :=
    instCango(0) &&
    io.in(1).valid &&
    rob.io.in(1).ready && // rob has empty slot
    !hasBlockInst(0) && // there is no block inst
    !hasBlockInst(1) &&
    !blockReg &&
    !mispredictionRecovery &&
    LookupTree(io.in(1).bits.ctrl.fuType, List(
      FuType.bru -> (brurs.io.in.ready && (bruCnt < 2.U)),
      FuType.alu -> (alu2rs.io.in.ready),
      FuType.lsu -> (lsurs.io.in.ready && (lsuCnt < 2.U)),
      FuType.mdu -> (mdurs.io.in.ready && (mduCnt < 2.U)),
      FuType.csr -> (csrrs.io.in.ready && (csrCnt < 2.U)),
      FuType.mou -> (csrrs.io.in.ready && (csrCnt < 2.U))
    ))
  instCango(2) := false.B
  assert(!(instCango(1) && !instCango(0))) // insts must be dispatched in seq

  // Per reservation station: index of the dispatch slot it takes its instruction from.
  // `noInst` points at the tied-off extra entries of `inst` / `instCango`.
  val noInst = 2.U
  val bruInst  = Mux(inst(0).decode.ctrl.fuType === FuType.bru, 0.U, Mux(inst(1).decode.ctrl.fuType === FuType.bru, 1.U, noInst))
  val alu1Inst = Mux(inst(0).decode.ctrl.fuType === FuType.alu, 0.U, noInst)
  val alu2Inst = Mux(inst(1).decode.ctrl.fuType === FuType.alu, 1.U, noInst)
  val csrInst  = Mux(inst(0).decode.ctrl.fuType === FuType.csr || inst(0).decode.ctrl.fuType === FuType.mou, 0.U, noInst)
  val lsuInst  = Mux(inst(0).decode.ctrl.fuType === FuType.lsu, 0.U, Mux(inst(1).decode.ctrl.fuType === FuType.lsu, 1.U, noInst))
  val mduInst  = Mux(inst(0).decode.ctrl.fuType === FuType.mdu, 0.U, Mux(inst(1).decode.ctrl.fuType === FuType.mdu, 1.U, noInst))

  /** Clears from `brMask` the checkpoint bit reported by `mispredictRec` while it is valid. */
  def updateBrMask(brMask: UInt) = {
    brMask & ~ (UIntToOH(mispredictRec.checkpoint) & Fill(checkpointSize, mispredictRec.valid))
  }

  // brMask(0): mask seen by slot 0
  // brMask(1): mask seen by slot 1 (adds slot 0's checkpoint if slot 0 is a dispatched branch)
  // brMask(2): unused placeholder, selected through `noInst`
  // brMask(3): mask after both slots
  val brMaskReg = RegInit(0.U(checkpointSize.W))
  val brMaskGen = updateBrMask(brMaskReg)
  val brMask = Wire(Vec(robWidth+2, UInt(checkpointSize.W)))
  val isBranch = List.tabulate(robWidth)(i => io.in(i).valid && io.in(i).bits.ctrl.fuType === FuType.bru)
  brMask(0) := brMaskGen
  brMask(1) := brMaskGen | (UIntToOH(brurs.io.updateCheckpoint.get.bits) & Fill(checkpointSize, io.in(0).fire && isBranch(0)))
  brMask(2) := DontCare
  brMask(3) := brMask(1) | (UIntToOH(brurs.io.updateCheckpoint.get.bits) & Fill(checkpointSize, io.in(1).fire && isBranch(1)))
  // Next-state value of brMaskReg; named so that the debug trace below reports exactly what
  // is latched (the trace used to print the don't-care entry brMask(2)).
  val brMaskNext = Mux(flushBackend, 0.U, Mux(io.redirect.valid && io.redirect.rtype === 1.U, updateBrMask(bruDelayer.io.out.bits.brMask), brMask(3)))
  brMaskReg := brMaskNext

  Debug("[brMask] %d: old %x -> new %x\n", GTimer(), brMaskReg, brMaskNext)

  val rs = List(brurs, alu1rs, alu2rs, csrrs, lsurs, mdurs)
  val rsInstSel = List(bruInst, alu1Inst, alu2Inst, csrInst, lsuInst, mduInst)
  for (i <- rs.indices) {
    rs(i).io.in.valid := instCango(rsInstSel(i))
    rs(i).io.in.bits := inst(rsInstSel(i))
    // must follow the whole-bundle connect above so that it overrides the brMask field
    rs(i).io.in.bits.brMask := brMask(rsInstSel(i))
    rs(i).io.cdb <> cdb
    rs(i).io.flush := flushBackend
    rs(i).io.mispredictRec := mispredictRec
  }

  // The ROB receives the renamed/forwarded decode information of every dispatched slot.
  for (i <- 0 until DispatchWidth) {
    rob.io.in(i).valid := instCango(i)
    rob.io.in(i).bits := inst(i).decode
    rob.io.brMaskIn(i) := brMask(i)
  }

  // ------------------------------------------------
  // Backend stage 2
  // Issue
  // ------------------------------------------------

  // Backend exception regs

  val raiseBackendException = WireInit(false.B)
  val commitBackendException = WireInit(false.B)

  commitBackendException := rob.io.exception

  // Function Units

  val bru = Module(new ALU(hasBru = true))
  val brucommit = Wire(new OOCommitIO)
  val brucommitdelayed = Wire(new OOCommitIO)
  val bruOut = bru.access(
    valid = brurs.io.out.valid,
    src1 = brurs.io.out.bits.decode.data.src1,
    src2 = brurs.io.out.bits.decode.data.src2,
    func = brurs.io.out.bits.decode.ctrl.fuOpType
  )
  val bruWritebackReady = Wire(Bool())
  bru.io.cfIn := brurs.io.out.bits.decode.cf
  bru.io.offset := brurs.io.out.bits.decode.data.imm
  bru.io.out.ready := bruDelayer.io.in.ready
  brucommit.decode := brurs.io.out.bits.decode
  brucommit.isMMIO := false.B
  brucommit.intrNO := 0.U
  brucommit.commits := bruOut
  brucommit.prfidx := brurs.io.out.bits.prfDest
  brucommit.brMask := brurs.io.out.bits.brMask
  brucommit.decode.cf.redirect := bru.io.redirect
  brucommit.exception := false.B
  brucommit.store := false.B

  bruDelayer.io.in.bits := brucommit
  bruDelayer.io.in.valid := bru.io.out.valid
  bruDelayer.io.out.ready := bruWritebackReady
  bruDelayer.io.mispredictRec := mispredictRec
  // NOTE(review): the delayers and ALU pipelines are flushed by io.flush, not flushBackend;
  // the two are the same signal while enableCheckpoint is true.
  bruDelayer.io.flush := io.flush
  bruDelayer.io.checkpointIn.get := brurs.io.recoverCheckpoint.get.bits
  brucommitdelayed := bruDelayer.io.out.bits

  // commit redirect
  bruRedirect := bruDelayer.io.out.bits.decode.cf.redirect
  bruRedirect.valid := bruDelayer.io.out.bits.decode.cf.redirect.valid && bruDelayer.io.out.fire
  mispredictRec.valid := bruDelayer.io.out.fire
  mispredictRec.checkpoint := bruDelayer.io.freeCheckpoint.get.bits
  mispredictRec.prfidx := bruDelayer.io.out.bits.prfidx
  mispredictRec.redirect := bruRedirect

  val alu1 = Module(new ALUEP())
  alu1rs.io.out <> alu1.io.in
  alu1.io.flush := io.flush
  alu1.io.mispredictRec := mispredictRec

  val alu2 = Module(new ALUEP())
  alu2rs.io.out <> alu2.io.in
  alu2.io.flush := io.flush
  alu2.io.mispredictRec := mispredictRec

  val lsu = Module(new LSU)
  val lsucommit = Wire(new OOCommitIO)
  val lsuTlbPF = WireInit(false.B)

  val lsuUop = lsurs.io.out.bits

  // The LSU is given src1 and the immediate as operands; the store data (src2) is passed
  // separately through lsu.io.wdata below.
  val lsuOut = lsu.access(
    valid = lsurs.io.out.valid,
    src1 = lsuUop.decode.data.src1,
    src2 = lsuUop.decode.data.imm,
    func = lsuUop.decode.ctrl.fuOpType,
    dtlbPF = lsuTlbPF
  )
  lsu.io.uopIn := lsuUop
  lsu.io.stMaskIn := lsurs.io.stMaskOut.get
  lsu.io.robAllocate.valid := io.in(0).fire
  lsu.io.robAllocate.bits := rob.io.index
  lsu.io.mispredictRec := mispredictRec
  lsu.io.scommit := rob.io.scommit
  haveUnfinishedStore := lsu.io.haveUnfinishedStore
  lsu.io.flush := flushBackend
  lsu.io.wdata := lsuUop.decode.data.src2
  // lsu.io.instr := lsuUop.decode.cf.instr
  io.dmem <> lsu.io.dmem
  io.dtlb <> lsu.io.dtlb
  BoringUtils.addSource(io.memMMU.dmem.loadPF, "loadPF") // FIXIT: this is nasty
  BoringUtils.addSource(io.memMMU.dmem.storePF, "storePF") // FIXIT: this is nasty
  lsu.io.out.ready := true.B //TODO
  lsucommit.decode := lsu.io.uopOut.decode
  lsucommit.isMMIO := lsu.io.isMMIO
  lsucommit.commits := lsuOut
  lsucommit.prfidx := lsu.io.uopOut.prfDest
  lsucommit.exception := lsu.io.exceptionVec.asUInt.orR
  lsucommit.store := lsu.io.commitStoreToCDB
  lsucommit.brMask := DontCare // FIXIT: gen lsucommit in LSU
  // fix exceptionVec
  lsucommit.decode.cf.exceptionVec := lsu.io.exceptionVec

  // backend exceptions only come from LSU
  raiseBackendException := lsucommit.exception && lsu.io.out.fire
  // for backend exceptions, we reuse 'intrNO' field in ROB
  // when ROB.exception(x) === 1, intrNO(x) represents backend exception vec for this inst
  lsucommit.intrNO := lsucommit.decode.cf.exceptionVec.asUInt

  // NutShell MDU is not pipelined, we can not wrap it into "Execution Pipeline"
  // TODO: update MDU
  val mdu = Module(new MDU)
  val mducommit = Wire(new OOCommitIO)
  val mducommitdelayed = Wire(new OOCommitIO)
  val mduOut = mdu.access(
    valid = mdurs.io.out.valid,
    src1 = mdurs.io.out.bits.decode.data.src1,
    src2 = mdurs.io.out.bits.decode.data.src2,
    func = mdurs.io.out.bits.decode.ctrl.fuOpType
  )
  val mduWritebackReady = Wire(Bool())
  mdu.io.out.ready := mduDelayer.io.in.ready
  mducommit.decode := mdurs.io.out.bits.decode
  mducommit.isMMIO := false.B
  mducommit.intrNO := 0.U
  mducommit.commits := mduOut
  mducommit.prfidx := mdurs.io.out.bits.prfDest
  mducommit.decode.cf.redirect.valid := false.B
  mducommit.decode.cf.redirect.rtype := DontCare
  mducommit.exception := false.B
  mducommit.store := false.B
  mducommit.brMask := mdurs.io.out.bits.brMask
  mdurs.io.commit.get := mdu.io.out.valid

  // assert(!(mdu.io.out.valid && !mduDelayer.io.in.ready))
  mduDelayer.io.in.bits := mducommit
  mduDelayer.io.in.valid := mdu.io.out.valid && mdurs.io.out.valid
  mduDelayer.io.out.ready := mduWritebackReady
  mduDelayer.io.mispredictRec := mispredictRec
  mduDelayer.io.flush := io.flush
  mducommitdelayed := mduDelayer.io.out.bits

  // The CSR unit serves two requesters: CSR instructions issued from CSRRS, and backend
  // exceptions reported by the ROB (in which case the uop comes from rob.io.beUop).
  // The assertion requires that the two never request it in the same cycle.
  val csr = Module(new CSR)
  assert(!(csrrs.io.out.valid && csrrs.io.out.bits.decode.ctrl.fuType === FuType.csr && commitBackendException))
  val csrVaild = csrrs.io.out.valid && csrrs.io.out.bits.decode.ctrl.fuType === FuType.csr || commitBackendException
  val csrUop = WireInit(csrrs.io.out.bits)
  when(commitBackendException){
    csrUop := rob.io.beUop
  }
  val csrcommit = Wire(new OOCommitIO)
  val csrOut = csr.access(
    valid = csrVaild,
    src1 = csrUop.decode.data.src1,
    src2 = csrUop.decode.data.src2,
    func = csrUop.decode.ctrl.fuOpType
  )
  csr.io.cfIn := csrUop.decode.cf
  csr.io.instrValid := csrVaild && !flushBackend
  csr.io.isBackendException := commitBackendException
  csr.io.out.ready := true.B
  csrcommit.decode := csrUop.decode
  csrcommit.isMMIO := false.B
  csrcommit.intrNO := csr.io.intrNO
  csrcommit.commits := csrOut
  csrcommit.prfidx := csrUop.prfDest
  csrcommit.decode.cf.redirect := csr.io.redirect
  csrcommit.exception := false.B
  csrcommit.store := false.B
  csrcommit.brMask := DontCare //FIXIT
  // fix wen
  when(csr.io.wenFix){csrcommit.decode.ctrl.rfWen := false.B}

  csr.io.imemMMU <> io.memMMU.imem
  csr.io.dmemMMU <> io.memMMU.dmem

  Debug(csrVaild && commitBackendException, "[BACKEND EXC] pc %x inst %x evec %b\n", csrUop.decode.cf.pc, csrUop.decode.cf.instr, csrUop.decode.cf.exceptionVec.asUInt)

  // The MOU shares the CSR reservation station and is selected by fuType.
  val mou = Module(new MOU)
  val moucommit = Wire(new OOCommitIO)
  // mou does not write register
  mou.access(
    valid = csrrs.io.out.valid && csrrs.io.out.bits.decode.ctrl.fuType === FuType.mou,
    src1 = csrrs.io.out.bits.decode.data.src1,
    src2 = csrrs.io.out.bits.decode.data.src2,
    func = csrrs.io.out.bits.decode.ctrl.fuOpType
  )
  mou.io.cfIn := csrrs.io.out.bits.decode.cf
  mou.io.out.ready := true.B // mou will stall the pipeline
  moucommit.decode := csrrs.io.out.bits.decode
  moucommit.isMMIO := false.B
  moucommit.intrNO := 0.U
  moucommit.commits := DontCare
  moucommit.prfidx := csrrs.io.out.bits.prfDest
  moucommit.decode.cf.redirect := mou.io.redirect
  moucommit.exception := false.B
  moucommit.store := false.B
  moucommit.brMask := DontCare //FIXIT

  // ------------------------------------------------
  // Backend stage 3+
  // Exec
  // ------------------------------------------------

  // ------------------------------------------------
  // Backend final stage
  // Commit to CDB (i.e. Writeback)
  // ------------------------------------------------

  // Common Data Bus
  //
  // Currently, FUs can commit to any CDB socket.
  //
  // Alternatively, FUs can be divided into different groups.
  // For each group, only one inst can be commited to ROB in a single cycle.

  val nullCommit = Wire(new OOCommitIO)
  nullCommit := DontCare

  // CDB arbit
  val (srcBRU, srcALU1, srcALU2, srcLSU, srcMDU, srcCSR, srcMOU, srcNone) = (0, 1, 2, 3, 4, 5, 6, 7)
  val commit = List(brucommitdelayed, alu1.io.out.bits, alu2.io.out.bits, lsucommit, mducommitdelayed, csrcommit, moucommit, nullCommit)
  val commitValid = List(bruDelayer.io.out.valid, alu1.io.out.valid, alu2.io.out.valid, lsu.io.out.valid, mduDelayer.io.out.valid, csr.io.out.valid, mou.io.out.valid, false.B)

  // Highest priority first. CSR, MOU and LSU are driven with out.ready := true.B above, so
  // they sit in front of the sources that can be back-pressured.
  val WritebackPriority = Seq(
    srcCSR,
    srcMOU,
    srcLSU,
    srcMDU,
    srcBRU,
    srcALU1,
    srcALU2,
    srcNone
  )

  // select 2 CDB commit request with highest priority
  val commitPriority = VecInit(WritebackPriority.map(i => commit(i)))
  val commitValidPriority = VecInit(WritebackPriority.map(i => commitValid(i)))
  // val secondValidMask = VecInit((0 until WritebackPriority.size).map(i => WritebackPriority(0 until i).map(j => commitValid(j)).reduceLeft(_ ^ _)))
  // notFirstMask(i): some higher-priority request is valid, i.e. request i is not the first.
  val notFirstMask = Wire(Vec(WritebackPriority.size, Bool()))
  notFirstMask(0) := false.B
  for (i <- 1 until WritebackPriority.size) {
    notFirstMask(i) := notFirstMask(i-1) | commitValidPriority(i-1)
  }
  // every valid request except the first one; its lowest set bit is the second winner
  val secondCommitValid = commitValidPriority.asUInt & notFirstMask.asUInt
  // notSecondMask(i): request i ranks below the second winner and therefore loses.
  val notSecondMask = Wire(Vec(WritebackPriority.size, Bool()))
  notSecondMask(0) := false.B
  for (i <- 1 until WritebackPriority.size) {
    notSecondMask(i) := notSecondMask(i-1) | secondCommitValid(i-1)
  }
  // one bit per source (in priority order): the request is granted a CDB socket this cycle
  val commitValidVec = commitValidPriority.asUInt & ~notSecondMask.asUInt

  Debug("[CDB Arb] %b %b %b %b %b\n", commitValidPriority.asUInt, notFirstMask.asUInt, secondCommitValid, notSecondMask.asUInt, commitValidVec)

  val cdbSrc1 = PriorityMux(commitValidPriority, commitPriority)
  val cdbSrc1Valid = PriorityMux(commitValidPriority, commitValidPriority)
  val cdbSrc2 = PriorityMux(secondCommitValid, commitPriority)
  val cdbSrc2Valid = PriorityMux(secondCommitValid, commitValidPriority)

  // Number of writeback requests this cycle, bucketed for the performance counters below.
  val commitValidPriorityUInt = commitValidPriority.asUInt
  val commitReqCnt = PopCount(commitValidPriorityUInt)
  val cmtStrHaz = List(
    commitReqCnt === 0.U,
    commitReqCnt === 1.U,
    commitReqCnt === 2.U,
    commitReqCnt === 3.U,
    commitReqCnt > 3.U
  )
  // The four highest-priority sources (CSR, MOU, LSU, MDU) must never request more than two
  // sockets in one cycle: three of them are always-ready and would lose their result.
  assert(!(PopCount(commitValidPriorityUInt(3,0)) > 2.U))

  cdb(0).valid := cdbSrc1Valid
  cdb(0).bits := cdbSrc1
  // cdb(0).ready := true.B
  cdb(1).valid := cdbSrc2Valid
  cdb(1).bits := cdbSrc2
  // cdb(1).ready := true.B

  // Back-pressure for the sources that can wait: ready only when granted a CDB socket.
  mduWritebackReady  := commitValidVec(WritebackPriority.indexOf(srcMDU))
  bruWritebackReady  := commitValidVec(WritebackPriority.indexOf(srcBRU))
  alu1.io.out.ready := commitValidVec(WritebackPriority.indexOf(srcALU1))
  alu2.io.out.ready := commitValidVec(WritebackPriority.indexOf(srcALU2))

  brurs.io.out.ready  := bru.io.in.ready
  alu1rs.io.out.ready := alu1.io.in.ready
  alu2rs.io.out.ready := alu2.io.in.ready
  csrrs.io.out.ready := csr.io.in.ready
  lsurs.io.out.ready := lsu.io.in.ready
  mdurs.io.out.ready := mdu.io.in.ready

  Debug(flushBackend, "[FLUSH]\n")
  Debug(io.redirect.valid, "[RDIRECT] target 0x%x\n", io.redirect.target)

  // Performance Counter

  BoringUtils.addSource(WireInit(alu1.io.out.fire), "perfCntCondMaluInstr")
  BoringUtils.addSource(WireInit(bru.io.out.fire), "perfCntCondMbruInstr")
  BoringUtils.addSource(WireInit(lsu.io.out.fire), "perfCntCondMlsuInstr")
  BoringUtils.addSource(WireInit(mdu.io.out.fire), "perfCntCondMmduInstr")
  BoringUtils.addSource(WireInit(!rob.io.in(0).ready), "perfCntCondMrobFull")
  BoringUtils.addSource(WireInit(!alu1rs.io.in.ready), "perfCntCondMalu1rsFull")
  BoringUtils.addSource(WireInit(!alu2rs.io.in.ready), "perfCntCondMalu2rsFull")
  BoringUtils.addSource(WireInit(!brurs.io.in.ready), "perfCntCondMbrursFull")
  BoringUtils.addSource(WireInit(!lsurs.io.in.ready), "perfCntCondMlsursFull")
  BoringUtils.addSource(WireInit(!mdurs.io.in.ready), "perfCntCondMmdursFull")
  BoringUtils.addSource(WireInit(lsurs.io.out.fire), "perfCntCondMlsuIssue")
  BoringUtils.addSource(WireInit(mdurs.io.out.fire), "perfCntCondMmduIssue")
  BoringUtils.addSource(rob.io.empty, "perfCntCondMrobEmpty")
  BoringUtils.addSource(WireInit(cmtStrHaz(0)), "perfCntCondMcmtCnt0")
  BoringUtils.addSource(WireInit(cmtStrHaz(1)), "perfCntCondMcmtCnt1")
  BoringUtils.addSource(WireInit(cmtStrHaz(2)), "perfCntCondMcmtCnt2")
  BoringUtils.addSource(WireInit(cmtStrHaz(3)), "perfCntCondMcmtStrHaz1")
  BoringUtils.addSource(WireInit(cmtStrHaz(4)), "perfCntCondMcmtStrHaz2")
  BoringUtils.addSource(WireInit(alu2.io.out.fire), "perfCntCondMaluInstr2")
  BoringUtils.addSource(WireInit(!(rob.io.in(0).fire | rob.io.in(1).fire)), "perfCntCondMdispatch0")
  BoringUtils.addSource(WireInit(rob.io.in(0).fire ^ rob.io.in(1).fire), "perfCntCondMdispatch1")
  BoringUtils.addSource(WireInit(rob.io.in(0).fire & rob.io.in(1).fire), "perfCntCondMdispatch2")

  // Dispatch-stall breakdown: target reservation station of each slot is not ready.
  val inst1RSfull = !LookupTree(io.in(0).bits.ctrl.fuType, List(
    FuType.bru -> brurs.io.in.ready,
    FuType.alu -> alu1rs.io.in.ready,
    FuType.lsu -> lsurs.io.in.ready,
    FuType.mdu -> mdurs.io.in.ready,
    FuType.csr -> csrrs.io.in.ready,
    FuType.mou -> csrrs.io.in.ready
  ))
  val inst2RSfull = !LookupTree(io.in(1).bits.ctrl.fuType, List(
    FuType.bru -> brurs.io.in.ready,
    FuType.alu -> alu2rs.io.in.ready,
    FuType.lsu -> lsurs.io.in.ready,
    FuType.mdu -> mdurs.io.in.ready,
    FuType.csr -> csrrs.io.in.ready,
    FuType.mou -> csrrs.io.in.ready
  ))
  // both slots want the same single-ported reservation station
  val dispatchConflict = bruCnt > 1.U || lsuCnt > 1.U || mduCnt > 1.U || csrCnt > 1.U
  BoringUtils.addSource(io.in(0).valid && (hasBlockInst(0) && !pipeLineEmpty || blockReg), "perfCntCondMdp1StBlk")
  BoringUtils.addSource(io.in(0).valid && inst1RSfull, "perfCntCondMdp1StRSf")
  BoringUtils.addSource(io.in(0).valid && !rob.io.in(0).ready, "perfCntCondMdp1StROBf")
  BoringUtils.addSource(dispatchConflict, "perfCntCondMdp1StConf")
  BoringUtils.addSource(io.in(0).valid && !instCango(0), "perfCntCondMdp1StCnt")

  BoringUtils.addSource(io.in(1).valid && (hasBlockInst(0) || hasBlockInst(1) || blockReg), "perfCntCondMdp2StBlk")
  BoringUtils.addSource(io.in(1).valid && inst2RSfull, "perfCntCondMdp2StRSf")
  BoringUtils.addSource(io.in(1).valid && !rob.io.in(1).ready, "perfCntCondMdp2StROBf")
  BoringUtils.addSource(dispatchConflict, "perfCntCondMdp2StConf")
  BoringUtils.addSource(io.in(1).valid && !instCango(0), "perfCntCondMdp2StSeq")
  BoringUtils.addSource(io.in(1).valid && !instCango(1), "perfCntCondMdp2StCnt")
  BoringUtils.addSource(!io.in(0).valid, "perfCntCondMdpNoInst")

  // Difftest: expose the register file contents.
  if (!p.FPGAPlatform || p.FPGADifftest) {
    val difftest = DifftestModule(new DiffPhyIntRegState(NRReg)) // Size = NRREG, use as ArchIntReg
    difftest.coreid := 0.U
    difftest.value := VecInit(rf.read_all)
  }

  // Difftest: report a NutCore trap instruction when it is issued from the CSR reservation station.
  if (!p.FPGAPlatform || p.FPGADifftest) {
    val cycleCnt = WireInit(0.U(XLEN.W))
    val instrCnt = WireInit(0.U(XLEN.W))
    val nutcoretrap = WireInit(csrrs.io.out.bits.decode.ctrl.isNutCoreTrap && csrrs.io.out.valid)

    BoringUtils.addSink(cycleCnt, "simCycleCnt")
    BoringUtils.addSink(instrCnt, "simInstrCnt")
    BoringUtils.addSource(nutcoretrap, "nutcoretrap")

    val difftest = DifftestModule(new DiffTrapEvent)
    difftest.coreid   := 0.U // TODO: nutshell does not support coreid auto config
    difftest.hasTrap  := nutcoretrap
    difftest.code     := csrrs.io.out.bits.decode.data.src1
    difftest.pc       := csrrs.io.out.bits.decode.cf.pc
    difftest.cycleCnt := cycleCnt
    difftest.instrCnt := instrCnt
  }

}

/**
  * In-order backend: a three-stage ISU -> EXU -> WBU pipeline.
  *
  * Bit 0 of `io.flush` flushes the ISU and the ISU->EXU pipeline register;
  * bit 1 flushes the EXU and the EXU->WBU pipeline register.
  */
class Backend_inorder(implicit val p: NutCoreConfig) extends NutCoreModule {
  val io = IO(new Bundle {
    val in = Vec(2, Flipped(Decoupled(new DecodeIO)))
    val flush = Input(UInt(2.W))
    val dmem = new SimpleBusUC(addrBits = VAddrBits)
    val memMMU = Flipped(new MemMMUIO)

    val redirect = new RedirectIO
  })

  val isu  = Module(new ISU)
  val exu  = Module(new EXU)
  val wbu  = Module(new WBU)

  // One flush bit per pipeline boundary.
  private def flushIssue = io.flush(0)
  private def flushExec  = io.flush(1)

  // Pipeline registers between the stages; the EXU->WBU register is released every cycle.
  PipelineConnect(isu.io.out, exu.io.in, exu.io.out.fire, flushIssue)
  PipelineConnect(exu.io.out, wbu.io.in, true.B, flushExec)

  // Stage flush controls.
  isu.io.flush := flushIssue
  exu.io.flush := flushExec

  // Issue stage: decoded instructions in, writeback and forwarded results back.
  isu.io.in <> io.in
  isu.io.wb <> wbu.io.wb
  // forward
  isu.io.forward <> exu.io.forward

  // Execute stage: data memory and MMU ports.
  io.dmem <> exu.io.dmem
  io.memMMU.imem <> exu.io.memMMU.imem
  io.memMMU.dmem <> exu.io.memMMU.dmem

  // Writeback stage: redirect out of the backend.
  io.redirect <> wbu.io.redirect
}
